# -*- coding: utf-8 -*-
"""
@Time ： 2024/4/18 11:45
@Auth ： fcq
@File ：process_sentiment_mask.py
@IDE ：PyCharm
@Motto：ABC(Always Be Coding)
"""
import json
import nltk
from gensim.utils import simple_preprocess

def load_sentiment_words(*paths):
    """Read one or more lexicon files and return their lines as a single set.

    Each file is opened as UTF-8 and split with ``str.splitlines``; every
    resulting line (unmodified) becomes one entry of the returned set.

    Args:
        paths: Paths of the lexicon files to read, in the order given.

    Returns:
        A ``set`` of all lines found across the files.  A set is used so the
        per-token membership test during masking is O(1) instead of a linear
        scan over the whole lexicon.
    """
    words = set()
    for path in paths:
        with open(path, 'r', encoding='utf-8') as f:
            words.update(f.read().splitlines())
    return words


def mask_sentiment_tokens(text, sentiment_words, mask_token='[MASK]'):
    """Replace every whitespace-separated token found in the lexicon.

    The text is split on whitespace, each token that is an exact member of
    ``sentiment_words`` is replaced by ``mask_token``, and the tokens are
    re-joined with single spaces.

    NOTE(review): matching is exact and case-sensitive, so a token such as
    ``Good`` or ``bad.`` is not masked by an entry ``good`` / ``bad``.

    Args:
        text: The sentence to process.
        sentiment_words: Container of words to mask (ideally a ``set``).
        mask_token: Replacement string for matched tokens.

    Returns:
        The masked sentence as a single space-joined string.
    """
    return ' '.join(
        mask_token if token in sentiment_words else token
        for token in text.split()
    )


def main():
    """Add a ``sentiment_mask_text`` field to every record and save the result.

    Loads the negative and positive lexicons, reads the JSON dataset, masks
    the ``text`` field of each record and writes the updated records to a new
    JSON file.
    """
    negative_words_path = '../opinion_lexicon_English/negative-words.txt'
    positive_words_path = '../opinion_lexicon_English/positive-words.txt'
    sentiment_words = load_sentiment_words(negative_words_path, positive_words_path)

    data_path = '../VAST/vast_test_cross_senti.json'
    with open(data_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Records are updated in place; the whole list is dumped afterwards.
    for record in data:
        record['sentiment_mask_text'] = mask_sentiment_tokens(
            record['text'], sentiment_words
        )

    with open('../VAST/vast_test_cross_senti_mask.json', 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False)


if __name__ == "__main__":
    main()
